library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
# Load the txhousing data set and keep a working copy in `df`.
# The package is named explicitly so data() can locate the data set even
# when ggplot2 has not been attached yet (data() otherwise only searches
# packages on the search path).
data("txhousing", package = "ggplot2")
df <- txhousing

# Show the column names of the working copy.
variable_names <- names(df)
print(variable_names)
[1] "city" "year" "month" "sales" "volume" "median" "listings" "inventory"
[9] "date"
# Derived column: element-wise product of the sales and median columns.
df[["New_Var"]] <- df[["sales"]] * df[["median"]]

# Count the distinct values in the city column and display the total.
num_cities <- dplyr::n_distinct(df[["city"]])
print(num_cities)
[1] 46
There are 46 unique cities
# Number of rows whose sales value is NA.
missing_sales <- length(which(is.na(df[["sales"]])))
print(missing_sales)
[1] 568
There are 568 missing sales
# Keep only the rows recorded for 2010; which() drops rows where the
# comparison is NA, matching what subset() does.
df_2010 <- df[which(df[["year"]] == 2010), ]

# Mean sales per city for 2010. The formula interface omits rows with
# missing sales before averaging. The result column is labelled
# "average_sales".
avg_sales_2010 <- setNames(
  aggregate(sales ~ city, data = df_2010, FUN = mean),
  c("city", "average_sales")
)
print(avg_sales_2010)
# ggplot2 is attached here so this chunk does not depend on a later
# library() call having already run.
library(ggplot2)

# Total sales per year; the formula interface omits rows with missing sales.
sales_per_year <- aggregate(sales ~ year, data = df, FUN = sum)

# Line chart of the yearly totals.
ggplot(sales_per_year, aes(x = year, y = sales)) +
  geom_line(color = "blue") +
  ggtitle("Total number of sales in different years") +
  xlab("Years") +
  ylab("Total Number of Sales") +
  theme_minimal()
\[ y = \beta_0 + \beta_1 x_1 + \beta_2 x_2 + \beta_3 x_3 + u \]
# Restrict the data to the rows whose city is "Victoria".
victoria_data <- df[df[["city"]] == "Victoria", ]

# Linear model of log(sales) on listings, year and median for Victoria,
# followed by the usual coefficient / fit summary.
lin.model <- lm(
  formula = log(sales) ~ listings + year + median,
  data = victoria_data
)
summary(lin.model)
Call:
lm(formula = log(sales) ~ listings + year + median, data = victoria_data)
Residuals:
Min 1Q Median 3Q Max
-0.58623 -0.12597 0.01247 0.15680 0.63317
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 4.265e+01 1.870e+01 2.280 0.023753 *
listings -2.942e-04 2.628e-04 -1.120 0.264379
year -1.946e-02 9.405e-03 -2.069 0.039967 *
median 6.108e-06 1.559e-06 3.917 0.000127 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.2275 on 181 degrees of freedom
(2 observations deleted due to missingness)
Multiple R-squared: 0.1546, Adjusted R-squared: 0.1406
F-statistic: 11.03 on 3 and 181 DF, p-value: 1.093e-06
`median` is the most significant predictor (p < 0.001). `year` and the intercept are significant at the 5% level, and `listings` is not significant (p = 0.264).
One problem with the code is that it uses a single equals sign (`=`); a comparison needs the double equals sign (`==`). The pipe operator used (`|>`) is also new, so you may not have a recent enough version of R to use it; the other pipe (`%>%`) is the safer choice.
library(plotly)
Loading required package: ggplot2
Attaching package: ‘ggplot2’
The following object is masked _by_ ‘.GlobalEnv’:
economics_long
Attaching package: ‘plotly’
The following object is masked from ‘package:ggplot2’:
last_plot
The following object is masked from ‘package:stats’:
filter
The following object is masked from ‘package:graphics’:
layout
aplease
library(maps)
library(ggplot2)
library(dplyr)
Attaching package: ‘dplyr’
The following objects are masked from ‘package:stats’:
filter, lag
The following objects are masked from ‘package:base’:
intersect, setdiff, setequal, union
# Load the canada.cities data set.
data("canada.cities")

# Drop rows whose country.etc value is exactly "Canada".
# NOTE(review): country.etc in canada.cities appears to hold province
# abbreviations rather than country names, in which case this filter keeps
# every row and the plot title below is misleading - confirm the intent.
canada_cities_filtered <- dplyr::filter(canada.cities, country.etc != "Canada")

# Scatter plot of city coordinates, coloured by country.etc.
first_graph <- ggplot(
  data = canada_cities_filtered,
  mapping = aes(x = long, y = lat, color = country.etc)
) +
  geom_point() +
  labs(
    title = "Cities in North America (Excluding Canada)",
    x = "Longitude",
    y = "Latitude",
    color = "Country"
  ) +
  theme_minimal()

# Turn the static ggplot into an interactive plotly widget and display it.
interactive_plot <- ggplotly(first_graph)
print(interactive_plot)
NULL
# Install titanic only when it is missing, so re-running the script does not
# download and reinstall the package every time.
if (!requireNamespace("titanic", quietly = TRUE)) {
  install.packages("titanic")
}
trying URL 'https://cran.rstudio.com/bin/macosx/big-sur-arm64/contrib/4.4/titanic_0.1.0.tgz'
Content type 'application/x-gzip' length 88521 bytes (86 KB)
==================================================
downloaded 86 KB
The downloaded binary packages are in
/var/folders/bq/dkdbc7qs7g1d203tqt73k9300000gn/T//RtmpvxYC7V/downloaded_packages
library(titanic)
Attaching package: ‘titanic’
The following object is masked _by_ ‘.GlobalEnv’:
titanic_train
# Load the training data and recode the predictors in one step:
# Pclass, Survived and Parch become numeric, and Sex becomes a 0/1
# indicator (1 = "male", 0 = anything else).
data("titanic_train")
titanic_train <- within(titanic_train, {
  Pclass <- as.numeric(Pclass)
  Sex <- ifelse(Sex == "male", 1, 0)
  Survived <- as.numeric(Survived)
  Parch <- as.numeric(Parch)
})

# Logistic regression of survival on class, sex, age and Parch.
model <- glm(
  formula = Survived ~ Pclass + Sex + Age + Parch,
  family = binomial,
  data = titanic_train
)

# Coefficient table and fit statistics.
summary(model)
Call:
glm(formula = Survived ~ Pclass + Sex + Age + Parch, family = binomial,
data = titanic_train)
Coefficients:
Estimate Std. Error z value Pr(>|z|)
(Intercept) 5.200300 0.516675 10.065 < 2e-16 ***
Pclass -1.287049 0.139186 -9.247 < 2e-16 ***
Sex -2.585173 0.214296 -12.064 < 2e-16 ***
Age -0.038228 0.007747 -4.934 8.04e-07 ***
Parch -0.147168 0.115579 -1.273 0.203
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
(Dispersion parameter for binomial family taken to be 1)
Null deviance: 964.52 on 713 degrees of freedom
Residual deviance: 645.60 on 709 degrees of freedom
(177 observations deleted due to missingness)
AIC: 655.6
Number of Fisher Scoring iterations: 5
Estimated coefficients: (Intercept): 5.200300; Pclass: -1.287049; Sex (1 = male): -2.585173; Age: -0.038228; Parch: -0.147168.
Pclass, Sex, and Age are significant (p < 0.001). Parch is not significant, since its p-value is 0.203.
library(tidyr)
# Load a fresh copy of economics_long from ggplot2.
data("economics_long", package = "ggplot2")

# Drop the 'value01' column. any_of() removes it when present and does
# nothing when it is absent, so this line never needs manual adjustment.
economics_long <- economics_long %>%
  select(-any_of("value01"))

# Reshape from long to wide: one column per date, filled from `value`.
# NOTE(review): spreading by `date` gives one row per variable; if one
# column per variable was intended, names_from should be `variable` - confirm.
econW <- economics_long %>%
  pivot_wider(names_from = date, values_from = value)

# Report the number of rows and columns of the wide data frame.
print(dim(econW))
[1] 5 575
There are 5 rows and 575 columns